#Data analysis on craft beers and breweries accross the United States. #Project is created with: R 4.1.2 #Packages: caret, class, gganimate, ggrepel, ggthemes, gifski, kableExtra, maps, tidyverse, tigris, viridis
#Question 2 & 3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#reading in the csvs
brewery = read.csv("./Data Files/Breweries.csv") #read breweries.csv
beer = read.csv("./Data Files/Beers.csv") #read beers.csv
#renaming columns for the join to the beer dataset
brewery=rename(brewery, Brewery_id=Brew_ID, Brewery=Name)
brewery$State <- trimws(brewery$State, which = c("left"))#added to remove space before state
#pulling ids with inaccurate info
Brewery_id<-as.integer(c(96,415,167, 262, 139))
error_brew_ids<-as.data.frame(Brewery_id)
#joining to get Brewery Names
brew_filter = merge(x=error_brew_ids,y=brewery,by="Brewery_id") %>% select(Brewery_id, Brewery)
#joining on brewery name to pull in correct columns--filter out inaccurate
brew_filter = merge(x=brewery,y=brew_filter,by="Brewery") %>% mutate(type = case_when( Brewery_id.x==Brewery_id.y ~ "correct",TRUE~ "wrong"))
#filtering down to rows with errors
brew_replace =brew_filter %>%filter(type == "wrong") %>% select(Brewery_id.y, Brewery, City, State)%>% rename(Brewery_id=Brewery_id.y)
#removing errors and replacing with updated df
brewery = anti_join(x=brewery,y=error_brew_ids,by="Brewery_id")
brewery<-rbind(brewery, brew_replace)
brewery$City = str_replace(brewery$City, "Menominie", "Menomonie")
#join between brewery and beer data
beer_brewery = merge(x=brewery,y=beer,by="Brewery_id")
#print first 6 rows
head(beer_brewery)
## Brewery_id Brewery City State Name Beer_ID ABV
## 1 1 NorthGate Brewing Minneapolis MN Pumpion 2689 0.060
## 2 1 NorthGate Brewing Minneapolis MN Stronghold 2688 0.060
## 3 1 NorthGate Brewing Minneapolis MN Parapet ESB 2687 0.056
## 4 1 NorthGate Brewing Minneapolis MN Get Together 2692 0.045
## 5 1 NorthGate Brewing Minneapolis MN Maggie's Leap 2691 0.049
## 6 1 NorthGate Brewing Minneapolis MN Wall's End 2690 0.048
## IBU Style Ounces
## 1 38 Pumpkin Ale 16
## 2 25 American Porter 16
## 3 47 Extra Special / Strong Bitter (ESB) 16
## 4 50 American IPA 16
## 5 26 Milk / Sweet Stout 16
## 6 19 English Brown Ale 16
#print last 6 rows
tail(beer_brewery)
## Brewery_id Brewery City State
## 2405 556 Ukiah Brewing Company Ukiah CA
## 2406 557 Butternuts Beer and Ale Garrattsville NY
## 2407 557 Butternuts Beer and Ale Garrattsville NY
## 2408 557 Butternuts Beer and Ale Garrattsville NY
## 2409 557 Butternuts Beer and Ale Garrattsville NY
## 2410 558 Sleeping Lady Brewing Company Anchorage AK
## Name Beer_ID ABV IBU Style Ounces
## 2405 Pilsner Ukiah 98 0.055 NA German Pilsener 12
## 2406 Porkslap Pale Ale 49 0.043 NA American Pale Ale (APA) 12
## 2407 Snapperhead IPA 51 0.068 NA American IPA 12
## 2408 Moo Thunder Stout 50 0.049 NA Milk / Sweet Stout 12
## 2409 Heinnieweisse Weissebier 52 0.049 NA Hefeweizen 12
## 2410 Urban Wilderness Pale Ale 30 0.049 NA English Pale Ale 12
#3. Address the missing values in each column.
#filtering of NAs for ABV and IBU-- rows drop from 2410 down to 1405 for this filtering
#only use this filtered dataset for the questions related to ABV/IBU#only use this filtered dataset for the questions related to ABV/IBU
clean_beer_brewery <- beer_brewery %>% filter_at(vars(ABV,IBU),all_vars(!is.na(.)))
#question 1
#How many breweries are present in each state? (counties heatmap)
#load libraries
library(ggplot2)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
#library(tidyverse)#added to 1st chunk
library(ggthemes)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
#load counties table
cnty <- read.csv("./Data Files/cty-cnty.csv")
colnames(cnty)[3] <- "region"
#load population data 2021
pop_est <- read.csv("./Data Files/NST-EST2021-alldata.csv")
pop_est <- pop_est %>% select(NAME,POPESTIMATE2021) %>% rename(region=NAME, "Pop2021"=POPESTIMATE2021) %>% mutate(region=tolower(region))
pop_est <- pop_est[-c(1:5),]
pop_est$region %>% str_replace("District of Columbia", "washington, d.c.")
## [1] "alabama" "alaska" "arizona"
## [4] "arkansas" "california" "colorado"
## [7] "connecticut" "delaware" "district of columbia"
## [10] "florida" "georgia" "hawaii"
## [13] "idaho" "illinois" "indiana"
## [16] "iowa" "kansas" "kentucky"
## [19] "louisiana" "maine" "maryland"
## [22] "massachusetts" "michigan" "minnesota"
## [25] "mississippi" "missouri" "montana"
## [28] "nebraska" "nevada" "new hampshire"
## [31] "new jersey" "new mexico" "new york"
## [34] "north carolina" "north dakota" "ohio"
## [37] "oklahoma" "oregon" "pennsylvania"
## [40] "rhode island" "south carolina" "south dakota"
## [43] "tennessee" "texas" "utah"
## [46] "vermont" "virginia" "washington"
## [49] "west virginia" "wisconsin" "wyoming"
## [52] "puerto rico"
pop_est$rank <- rank(-pop_est$Pop2021)
#wrangle data
#brewery$State <- trimws(brewery$State, which = c("left"))
cnty <- cnty %>% select("City", "State","County","region")#select columns
cnty <- cnty[!duplicated(cnty),]
#left join brew and cnty
brewcomb <- merge(brewery, select(cnty, c("City", "State", "County","region")), by=c("City","State"))
brewcomb <- brewcomb %>% distinct(Brewery_id, .keep_all = TRUE)
#1 table wtih state count, map with state count and map with county count
#state and county tables
us_states <- map_data("state")
us_counties <- map_data("county")
#state brewery count
brew_state <- brewcomb %>% mutate(region=tolower(region)) %>%
group_by(region) %>% count(region)
brew_state <- brew_state %>% left_join(pop_est,by="region") %>% arrange(desc(n))
brew_state %>% mutate(region=str_to_title(region),Pop2021=round(Pop2021/1000000,2)) %>% kable(col.names = c("State","Breweries","Population 2021","Pop. Rank")) %>%
kable_styling(latex_options = c("striped", "scale_down")) %>%
row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
column_spec(1:2, width = "0.5in")
| State | Breweries | Population 2021 | Pop. Rank |
|---|---|---|---|
| Colorado | 47 | 5.81 | 21 |
| California | 39 | 39.24 | 1 |
| Michigan | 33 | 10.05 | 10 |
| Oregon | 29 | 4.25 | 27 |
| Texas | 28 | 29.53 | 2 |
| Pennsylvania | 25 | 12.96 | 5 |
| Washington | 23 | 7.74 | 13 |
| Indiana | 22 | 6.81 | 17 |
| Massachusetts | 22 | 6.98 | 15 |
| Wisconsin | 20 | 5.90 | 20 |
| North Carolina | 19 | 10.55 | 9 |
| Illinois | 18 | 12.67 | 6 |
| New York | 16 | 19.84 | 4 |
| Virginia | 16 | 8.64 | 12 |
| Florida | 15 | 21.78 | 3 |
| Ohio | 15 | 11.78 | 7 |
| Minnesota | 12 | 5.71 | 22 |
| Arizona | 11 | 7.28 | 14 |
| Vermont | 10 | 0.65 | 51 |
| Maine | 9 | 1.37 | 43 |
| Missouri | 9 | 6.17 | 18 |
| Montana | 9 | 1.10 | 44 |
| Connecticut | 8 | 3.61 | 29 |
| Alaska | 7 | 0.73 | 49 |
| Georgia | 7 | 10.80 | 8 |
| Maryland | 7 | 6.17 | 19 |
| Oklahoma | 6 | 3.99 | 28 |
| Idaho | 5 | 1.90 | 39 |
| Iowa | 5 | 3.19 | 32 |
| Louisiana | 5 | 4.62 | 25 |
| Nebraska | 5 | 1.96 | 38 |
| Rhode Island | 5 | 1.10 | 45 |
| Hawaii | 4 | 1.44 | 41 |
| Kentucky | 4 | 4.51 | 26 |
| New Mexico | 4 | 2.12 | 37 |
| South Carolina | 4 | 5.19 | 23 |
| Utah | 4 | 3.34 | 30 |
| Wyoming | 4 | 0.58 | 52 |
| Alabama | 3 | 5.04 | 24 |
| Kansas | 3 | 2.93 | 36 |
| New Hampshire | 3 | 1.39 | 42 |
| New Jersey | 3 | 9.27 | 11 |
| Tennessee | 3 | 6.98 | 16 |
| Arkansas | 2 | 3.03 | 34 |
| Delaware | 2 | 1.00 | 46 |
| Mississippi | 2 | 2.95 | 35 |
| Nevada | 2 | 3.14 | 33 |
| North Dakota | 1 | 0.77 | 48 |
| South Dakota | 1 | 0.90 | 47 |
| Washington, D.c. | 1 | NA | NA |
| West Virginia | 1 | 1.78 | 40 |
#state gradient map
us_states %>% left_join(brew_state,by=("region")) %>%
ggplot(aes(x=long,y=lat,group=group,fill=n))+
geom_polygon(color = "gray90", size=.1)+
coord_map(projection = "albers", lat0=45, lat1=55)+
scale_fill_viridis_c()+
theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.background = element_blank(),
axis.title = element_blank())+
labs(fill="Brewery\nCount")+
ggtitle("Breweries by State\nContinental US")
#county brewery count
brew_county <- brewcomb %>% mutate(subregion=tolower(County)) %>%
group_by(subregion) %>% count(subregion)
#county gradient map
us_counties %>% left_join(brew_county,by="subregion") %>%
ggplot()+
geom_polygon(aes(x=long,y=lat,group=group,fill=n),color = "gray90",size=.1)+
coord_map(projection = "albers",lat0=45,lat1=55)+
scale_fill_viridis_c()+
theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.background = element_blank(),
axis.title = element_blank())+
labs(fill="Brewery\nCount")+
ggtitle("Breweries by County\nContinental US")+
geom_polygon(data = us_states,aes(x=long,y=lat,group=group), color = "white",size=.15,fill="transparent")
#The table above shows the breweries present in each state, the top 5 states are Colorado (47), California (39), Michigan (33), Oregion (29), and Texas (28).
#Question 4
#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
#ABV
bb<-clean_beer_brewery %>% group_by(State) %>% summarize(medianABV = median(ABV), count = n()) %>% ggplot(aes(x = reorder(State, -medianABV), y = medianABV, fill=medianABV)) + geom_col()+
xlab('State') + ylab('Median ABV')+ggtitle("Median ABV by State") +theme(axis.text = element_text(size = 6))+ scale_fill_gradient(low='#46085C', high='#4FBA6E', name='Median ABV')
bb + scale_y_continuous(labels = scales::percent)
#Question 4
#DC and Kentucky have the highest Median ABV. Both states have a high % of brands with the American Double / Imperial IPA, which has #an ABV between 7-14%
#IBU
bb<-clean_beer_brewery %>% group_by(State) %>% summarize(medianABV = median(ABV), medianIBU=round(median(IBU),0), count = n()) %>% ggplot(aes(x = reorder(State, -medianABV), y = medianIBU, fill=medianIBU)) + geom_col()+
xlab('State') + ylab('Median IBU')+ggtitle("Median IBU by State") +theme(axis.text = element_text(size = 6))+ scale_fill_gradient(low='#46085C', high='#4FBA6E', name='Median IBU')
bb
#Question 4
#Maine and West Virginia have the highest Median IBU. Maine is known for the IPAs, which indexes higher on IBU.
#combined
bb<-clean_beer_brewery %>% group_by(State) %>% summarize(medianABV = median(ABV), medianIBU=round(median(IBU),0), count = n()) %>% ggplot(aes(x = reorder(State, -medianABV), y = medianABV, fill=medianIBU)) + geom_col()+
xlab('State') + ylab('Median ABV')+ggtitle("Median IBU/ABV by State") +theme(axis.text = element_text(size = 6))+ scale_fill_gradient(low='#46085C', high='#4FBA6E', name='Median IBU')
bb + scale_y_continuous(labels = scales::percent)
#Question 4
#Delaware and New Hampshire have some of the highest IBU for their ABV. There are some articles from Delaware about "table beer" and #low ABV fruit beers.
#However, both states combined represent <1% of the data of the overall data.
#5 which state has the maximum alcoholic (ABV) beer? which state has the most bitter (IBU) beer?
library(kableExtra)
#maximum ABV by state
maxABV <- beer_brewery %>% merge(cnty, by=c("City", "State")) %>% select(ABV, region) %>%
filter(!is.na(ABV)) %>% group_by(region) %>% summarise(Max_ABV=max(ABV)) %>% arrange(desc(Max_ABV))
maxABV %>% kable(col.names = c("State", "Max ABV")) %>%
kable_styling(latex_options = c("striped", "scale_down")) %>%
row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
column_spec(1:2, width = "0.5in")
| State | Max ABV |
|---|---|
| Colorado | 0.128 |
| Kentucky | 0.125 |
| Indiana | 0.120 |
| New York | 0.100 |
| California | 0.099 |
| Idaho | 0.099 |
| Maine | 0.099 |
| Massachusetts | 0.099 |
| Michigan | 0.099 |
| Minnesota | 0.099 |
| Nevada | 0.099 |
| New Jersey | 0.099 |
| North Carolina | 0.099 |
| Ohio | 0.099 |
| Pennsylvania | 0.099 |
| Texas | 0.099 |
| Wisconsin | 0.099 |
| South Carolina | 0.097 |
| Illinois | 0.096 |
| Nebraska | 0.096 |
| Vermont | 0.096 |
| Arizona | 0.095 |
| Iowa | 0.095 |
| Alabama | 0.093 |
| Washington, D.C. | 0.092 |
| Connecticut | 0.090 |
| Utah | 0.090 |
| Louisiana | 0.088 |
| Oregon | 0.088 |
| Virginia | 0.088 |
| Rhode Island | 0.086 |
| Kansas | 0.085 |
| Maryland | 0.085 |
| Oklahoma | 0.085 |
| Washington | 0.084 |
| Hawaii | 0.083 |
| Florida | 0.082 |
| Mississippi | 0.080 |
| Missouri | 0.080 |
| New Mexico | 0.080 |
| Montana | 0.075 |
| Georgia | 0.072 |
| Wyoming | 0.072 |
| South Dakota | 0.069 |
| Alaska | 0.068 |
| North Dakota | 0.067 |
| West Virginia | 0.067 |
| New Hampshire | 0.065 |
| Tennessee | 0.062 |
| Arkansas | 0.061 |
| Delaware | 0.055 |
#Maximum IBU by state
maxIBU <- beer_brewery %>% merge(cnty, by=c("City", "State")) %>% select(IBU, region) %>%
filter(!is.na(IBU)) %>% group_by(region) %>% summarise(Max_IBU=max(IBU)) %>% arrange(desc(Max_IBU))
maxIBU %>% kable(col.names = c("State","Max IBU")) %>%
kable_styling(latex_options = c("striped", "scale_down")) %>%
row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
column_spec(1:2, width = "0.5in")
| State | Max IBU |
|---|---|
| Oregon | 138 |
| Virginia | 135 |
| Massachusetts | 130 |
| Ohio | 126 |
| Minnesota | 120 |
| Vermont | 120 |
| Texas | 118 |
| California | 115 |
| Indiana | 115 |
| Michigan | 115 |
| Washington, D.C. | 115 |
| Pennsylvania | 113 |
| New York | 111 |
| Kansas | 110 |
| Colorado | 104 |
| Alabama | 103 |
| Idaho | 100 |
| Illinois | 100 |
| New Jersey | 100 |
| New Mexico | 100 |
| Oklahoma | 100 |
| Arizona | 99 |
| Iowa | 99 |
| North Carolina | 98 |
| Maryland | 90 |
| Nevada | 90 |
| Missouri | 89 |
| Connecticut | 85 |
| Utah | 83 |
| Washington | 83 |
| Florida | 82 |
| New Hampshire | 82 |
| Kentucky | 80 |
| Mississippi | 80 |
| Montana | 80 |
| Wisconsin | 80 |
| Hawaii | 75 |
| Rhode Island | 75 |
| Wyoming | 75 |
| Alaska | 71 |
| West Virginia | 71 |
| Maine | 70 |
| North Dakota | 70 |
| Georgia | 65 |
| Nebraska | 65 |
| South Carolina | 65 |
| Tennessee | 61 |
| Louisiana | 60 |
| Delaware | 52 |
| Arkansas | 39 |
#US map for Max ABV by state
us_states %>% mutate(region=str_to_title(region)) %>% left_join(maxABV,by="region") %>%
ggplot(aes(x=long,y=lat,group=group, fill = Max_ABV))+
geom_polygon(color = "gray90", size=.1)+
coord_map(projection = "albers", lat0=45, lat1=55)+
scale_fill_viridis_c()+
theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.background = element_blank(),
axis.title = element_blank())+
labs(fill="Maximum\nAlcohol by Volume")+
ggtitle("Maximum ABV by State\nContinental US")
#US map for max IBU by state
us_states %>% mutate(region=str_to_title(region)) %>% left_join(maxIBU,by="region") %>%
ggplot(aes(x=long,y=lat,group=group, fill = Max_IBU))+
geom_polygon(color = "gray90", size=.1)+
coord_map(projection = "albers", lat0=45, lat1=55)+
scale_fill_viridis_c()+
theme(axis.line = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.background = element_blank(),
axis.title = element_blank())+
labs(fill="Maximum\nIBU")+
ggtitle("Maximum IBU by State\nContinental US")
#Colorado is the state with the beer with the maximum alcohol by volume (ABV) with .128. Oregon is the state with the beer with the maximum International Bitterness Units (IBU) with .138.
#Question 6
#6 comment on the summary statistics and distribution of the ABV variable
#Density Curve for ABV
beer_brewery %>% ggplot(aes(x=ABV))+geom_density(color="#39568C")+theme_bw()+theme(
panel.grid = element_blank(),
panel.border = element_blank())
## Warning: Removed 62 rows containing non-finite values (stat_density).
#Boxplot for ABV
beer_brewery %>% ggplot(aes(x=ABV))+geom_boxplot()
## Warning: Removed 62 rows containing non-finite values (stat_boxplot).
#5 number summary and mean for ABV
ABVsum <- beer_brewery %>% filter(!is.na(ABV)) %>%
select(ABV) %>%
summarise(min=min(ABV),
"25%"=quantile(ABV,.25),
median=median(ABV),
"75%"=quantile(ABV,.75),
max=max(ABV),mean=mean(ABV))
ABVsum %>% kable(col.names = c("Min", "25%", "Med", "75%", "Max", "Mean")) %>%
kable_styling(latex_options = c("striped", "scale_down")) %>%
row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
column_spec(1:2, width = "0.5in")
| Min | 25% | Med | 75% | Max | Mean |
|---|---|---|---|---|---|
| 0.001 | 0.05 | 0.056 | 0.067 | 0.128 | 0.0597734 |
#right skewed distribution centered at .057, there seems to be some degree of rounding occuring at around .1,
#given that there are several beers that are exactly at the line with fewer slightly below and almost none
#slightly above the number. There also seems to be some degree of rounding at .5, but this is less pronounced
#right skewed distribution centered at .057, there seems to be some degree of rounding occuring at around .1, given that there are several beers that are exactly at the line with fewer slightly below and almost none slightly above the number. There also seems to be some degree of rounding at .5, but this is less pronounced
#Question 7
#7.Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.
#comparison of ABV/IBU in scatter plot with regression line
#regression line
bb.lm <- lm(ABV~ IBU, beer_brewery)
bb = clean_beer_brewery %>% ggplot(aes(x = ABV, y=IBU, color = IBU)) + geom_point(aes(fill=IBU, color=IBU),pch=21,size=3,colour="black")+ ggtitle("International Bitterness Units vs. Alchohol by Volume")+
geom_smooth(method='lm', formula= y~x, color="black")
bb + scale_x_continuous(labels = scales::percent)
str(summary(bb.lm))
## List of 12
## $ call : language lm(formula = ABV ~ IBU, data = beer_brewery)
## $ terms :Classes 'terms', 'formula' language ABV ~ IBU
## .. ..- attr(*, "variables")= language list(ABV, IBU)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "ABV" "IBU"
## .. .. .. ..$ : chr "IBU"
## .. ..- attr(*, "term.labels")= chr "IBU"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(ABV, IBU)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "ABV" "IBU"
## $ residuals : Named num [1:1405] 0.00174 0.0063 -0.00542 -0.01747 -0.00505 ...
## ..- attr(*, "names")= chr [1:1405] "1" "2" "3" "4" ...
## $ coefficients : num [1:2, 1:4] 4.49e-02 3.51e-04 5.18e-04 1.04e-05 8.68e+01 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "IBU"
## .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
## $ aliased : Named logi [1:2] FALSE FALSE
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "IBU"
## $ sigma : num 0.0101
## $ df : int [1:3] 2 1403 2
## $ r.squared : num 0.45
## $ adj.r.squared: num 0.449
## $ fstatistic : Named num [1:3] 1147 1 1403
## ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
## $ cov.unscaled : num [1:2, 1:2] 2.64e-03 -4.52e-05 -4.52e-05 1.06e-06
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "(Intercept)" "IBU"
## .. ..$ : chr [1:2] "(Intercept)" "IBU"
## $ na.action : 'omit' Named int [1:1005] 19 35 36 37 38 39 40 41 42 43 ...
## ..- attr(*, "names")= chr [1:1005] "19" "35" "36" "37" ...
## - attr(*, "class")= chr "summary.lm"
#Question 7
#There is evidence of a positive relationship between ABV and IBU, based on the regression line.
#R-Squared has a value of 0.45, meaning ABV explains 45% of the variation in IBU.
#Question 8
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(class)
#Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.
#add ipa, ale, other column
ipa_ale <-
clean_beer_brewery %>% mutate(Ale_type = ifelse(
str_detect(clean_beer_brewery$Style, regex("ipa", ignore_case = TRUE)),
"IPA",
ifelse(str_detect(
clean_beer_brewery$Style, regex("ale", ignore_case = TRUE)
), "Ale", "other")
))
#filter out "other" type of beers and drop levels
ipa_ale <- ipa_ale %>% filter(Ale_type %in% c("IPA", "Ale")) %>% droplevels(ipa_ale$Ale_type)
#for loops for KNN model and hypertuning k parameter
nsplits <- 100
nks <- 30
df <- data.frame()
set.seed(6)
for (i in 1:nsplits) {
n <- nrow(ipa_ale)
nsamp <- sample(1:n,round(n*.7)) #for a 70-30 split
ipa_train <- ipa_ale[nsamp,]
ipa_test <- ipa_ale[-nsamp,]
for (j in 1:nks) {
cm <- knn(scale(ipa_train[,7:8]),scale(ipa_test[,7:8]),cl=ipa_train$Ale_type,k=i) %>% confusionMatrix(as.factor(ipa_test$Ale_type),positive = "IPA")
cm_values <- data.frame("k"=j,"Accuracy"=cm$overall[1],"Sensitivity"=cm$byClass[1],"Specificity"=cm$byClass[2])
df <- rbind(df,cm_values)
}
}
#summarize the k results
means <- df %>% group_by(k) %>% summarise(Accuracy=mean(Accuracy)*100,Sensitivity=mean(Sensitivity)*100,Specificity=mean(Specificity)*100)
#plot accuracy, sensitivity and specificity
means %>% ggplot(aes(x=k,y=Accuracy))+geom_line(color="#39568C")+theme_bw()+theme(
panel.grid = element_blank(),
panel.border = element_blank())
means %>% ggplot(aes(x=k,y=Sensitivity))+geom_line(color="#39568C")+theme_bw()+theme(
panel.grid = element_blank(),
panel.border = element_blank())
means %>% ggplot(aes(x=k,y=Specificity))+geom_line(color="#39568C")+theme_bw()+theme(
panel.grid = element_blank(),
panel.border = element_blank())
#After hypertuning the k parameter, we can see that looking at the 14 closest beers in terms of ABV and IBU would help us make a prediction on whether it’s a Indian Pale Ale or another type of Ale, with about 85% accuracy. #moreover this model is able to properly classify other ales as Ale (Specificity) with almost 86% accuracy, and classify IPA as IPA (sensitivity) with 83.64% accuracy.
#Question 9
#Knock their socks off! Find one other useful inference from the data that you feel Budweiser may be able to find value in. You must convince them why it is important and back up your conviction with appropriate statistical evidence.
#join brew/beer and cnty (bubble map data for brews)
library(tidyverse)
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:maps':
##
## unemp
us_counties <- map_data("county")
cnty<-cnty%>% select(City, State, region, County)
cnty<- cnty %>% distinct(City, State, .keep_all = TRUE)
us_counties<-us_counties %>% distinct(region, subregion, .keep_all = TRUE)
brewcomb_type <- merge(clean_beer_brewery, select(cnty, c("City", "State", "County","region")), by=c("City","State"))
brewcomb_type <- brewcomb_type %>% mutate(subregion=tolower(County),region=tolower(region))
#combining for lat long info
brewcomb_type <- merge(brewcomb_type, select(us_counties, c("lat", "long", "region","subregion")), by=c("region","subregion"))
#style text clean-up
library(stringr)
brewcomb_type$BeerStyle <- str_replace_all(brewcomb_type$Style, c(".*American.*IPA.*"="_American IPA", ".*American.*Ale.*"= "_American Ale", ".*Stout.*"="_Stout", ".*Lager.*"="_Lager", ".*Pilsner.*"= "_Pilsner", ".*Bitter.*"= "_Bitter", ".*Porter.*"= "_Porter", ".*Belgian.*"= "_German", ".*Bock.*"= "_German", ".*Czech.*"= "_German", ".*Doppelbock.*"= "_German", ".*German.*"= "_German", ".*Gose.*"= "_German", ".*Grisette.*"= "_German", ".*Hefeweizen.*"= "_German", ".*Kölsch.*"= "_German", ".*Maibock.*"= "_German", ".*Märzen.*"= "_German", ".*Munich.*"= "_German", ".*Saison.*"= "_German", ".*Vienna.*"= "_German", ".*Witbier.*"= "_German", ".*Altbier.*"= "_German", ".*Baltic.*"= "_German", ".*Berliner.*"= "_German", ".*Keller.*"= "_German", ".*Schwarzbier.*"= "_German"))
brewcomb_type$BeerStyle<-str_replace(brewcomb_type$BeerStyle,"^(?!_).*$", "")
brewcomb_type$SimpleStyle<-str_replace(brewcomb_type$Style,".*Ale.*", "_Ale")
brewcomb_type$SimpleStyle<-str_replace(brewcomb_type$SimpleStyle,"^(?!_).*$", "")
brewcomb_type$BeerStyle <- paste(brewcomb_type$BeerStyle, brewcomb_type$SimpleStyle)
brewcomb_type$BeerStyle<-str_replace(brewcomb_type$BeerStyle,".*_American Ale _Ale.*", "_American Ale")
brewcomb_type$BeerStyle <- trimws(brewcomb_type$BeerStyle, which = c("left"))#added to remove space before style
brewcomb_type$BeerStyle<-str_replace(brewcomb_type$BeerStyle,"^(?!_).*$", "Other")
brewcomb_type$BeerStyle <- str_replace_all(brewcomb_type$BeerStyle, c(".*_"=" ", ".*German.*Ale.*"="German Ale", ".*American.*IPA.*"="American IPA"))
brewcomb_type$BeerStyle <- trimws(brewcomb_type$BeerStyle, which = c("right"))#added to remove space after style
brewcomb_type$BeerStyle <- trimws(brewcomb_type$BeerStyle, which = c("left"))#added to remove space before style
#summary of beer styles
sum_brew<-brewcomb_type %>% group_by(BeerStyle) %>% summarize(meanabv = mean(ABV), meanIBU = mean(IBU), beers = n()) %>% arrange(desc(beers))
sum_brew
## # A tibble: 10 × 4
## BeerStyle meanabv meanIBU beers
## <chr> <dbl> <dbl> <int>
## 1 American Ale 0.0556 37.0 401
## 2 American IPA 0.0694 72.4 360
## 3 German 0.0522 23.9 158
## 4 Ale 0.0609 27.8 127
## 5 Lager 0.0504 23.5 79
## 6 Other 0.0621 27.1 79
## 7 Stout 0.0684 44.8 50
## 8 Porter 0.0631 33.8 40
## 9 Bitter 0.0551 43.8 16
## 10 Pilsner 0.0499 27.4 15
#summary table for powerpoint
sum_brew %>% kable(col.names = c("BeerStyle","meanabv","meanIBU","beers")) %>%
kable_styling(latex_options = c("striped", "scale_down")) %>%
row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
column_spec(1:2, width = "0.5in")
| BeerStyle | meanabv | meanIBU | beers |
|---|---|---|---|
| American Ale | 0.0555636 | 37.04489 | 401 |
| American IPA | 0.0694222 | 72.42222 | 360 |
| German | 0.0522342 | 23.86709 | 158 |
| Ale | 0.0609213 | 27.82677 | 127 |
| Lager | 0.0503797 | 23.45570 | 79 |
| Other | 0.0621392 | 27.12658 | 79 |
| Stout | 0.0683800 | 44.84000 | 50 |
| Porter | 0.0631250 | 33.75000 | 40 |
| Bitter | 0.0551250 | 43.75000 | 16 |
| Pilsner | 0.0499333 | 27.40000 | 15 |
#summary table for bubble graph/map
bb<-brewcomb_type %>% group_by(BeerStyle, lat, long) %>% summarize(meanabv = mean(ABV), meanIBU = mean(IBU), beers = n()) %>% arrange(desc(beers))
## `summarise()` has grouped output by 'BeerStyle', 'lat'. You can override using the `.groups` argument.
bb
## # A tibble: 637 × 6
## # Groups: BeerStyle, lat [609]
## BeerStyle lat long meanabv meanIBU beers
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American Ale 40.3 -106. 0.0639 54.0 22
## 2 American Ale 45.6 -122. 0.0469 32.8 20
## 3 American IPA 33.5 -118. 0.0719 75.7 17
## 4 American IPA 40.3 -106. 0.0796 70.1 15
## 5 American Ale 33.5 -118. 0.059 46.8 12
## 6 American Ale 47.6 -121. 0.0547 33.4 12
## 7 American IPA 28.2 -82.7 0.0707 66.3 12
## 8 American IPA 34.8 -119. 0.0745 71.1 11
## 9 American IPA 47.6 -121. 0.0593 59.5 11
## 10 Ale 39.6 -86.3 0.0724 25.8 10
## # … with 627 more rows
# Beer Style bubble map fixed
library(ggrepel)
usa <- map_data("state")
ggplot() +
geom_polygon(data = usa, aes(x=long, y = lat, group = group),color = "white", fill="grey", alpha=0.3) +
geom_point( data=bb, aes(x=long, y=lat, color=BeerStyle, size=beers, alpha=0.5)) +
scale_color_viridis(option="viridis", discrete=TRUE, name="Beer Style" ) +
theme_void() +
scale_size(range = c(1, 10), name=" # of beers")+
ggtitle("Number of Beers by Style")
#animation of bubble graph using map and simplified beer styles
library(gganimate)
library(gifski)
p <- ggplot() +
geom_polygon(data = usa, aes(x=long, y = lat, group = group),color = "white", fill="grey", alpha=0.3) +
geom_point( data=bb, aes(x=long, y=lat, color=BeerStyle, size=beers, alpha=0.5)) +
scale_color_viridis_d(option="viridis", name="Beer Style" ) +
theme_void() +
scale_size(range = c(2, 20), name=" # of beers")+
ggtitle("Number of Beers by Style\n {closest_state}")+
theme(text = element_text(size=20))+
transition_states(BeerStyle) +
enter_fade() +
enter_grow() +
ease_aes('sine-in-out')
animate(p, width = 900, height = 572, duration = 14)
#boxplots by Beer Style
#boxplot by ABV/Style
gg<-ggplot(brewcomb_type, aes(x = BeerStyle, y = ABV, fill = BeerStyle)) +
geom_boxplot() +
theme_classic()
gg+coord_flip()+ scale_fill_viridis(option="magma", discrete=TRUE, name="Beer Style" )
#boxplot by IBU/Style
gg<-ggplot(brewcomb_type, aes(x = BeerStyle, y = IBU, fill = BeerStyle)) +
geom_boxplot() +
theme_classic()
gg+coord_flip()+ scale_fill_viridis(option="magma", discrete=TRUE, name="Beer Style" )
# knit together pngs into gif for powerpoint
library(magick)
## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
list.files(path='//Users/macowner/Desktop/Data Science/gif//', pattern = '*.png', full.names = TRUE) %>%
image_read() %>% # reads each path file
image_join() %>% # joins image
image_animate(fps=4) %>% # animates, can opt for number of loops
image_write("FileName.gif") # write to current dir
## Warning in image_write(., "FileName.gif"): Writing image with 0 frames
#ipa mapping
ipa<-brewcomb_type%>% filter(BeerStyle=="American IPA") %>% group_by(BeerStyle, lat, long) %>% summarize(meanabv = mean(ABV), meanIBU = mean(IBU), beers = n()) %>% arrange(desc(beers))
## `summarise()` has grouped output by 'BeerStyle', 'lat'. You can override using the `.groups` argument.
ipa
## # A tibble: 149 × 6
## # Groups: BeerStyle, lat [140]
## BeerStyle lat long meanabv meanIBU beers
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 American IPA 33.5 -118. 0.0719 75.7 17
## 2 American IPA 40.3 -106. 0.0796 70.1 15
## 3 American IPA 28.2 -82.7 0.0707 66.3 12
## 4 American IPA 34.8 -119. 0.0745 71.1 11
## 5 American IPA 47.6 -121. 0.0593 59.5 11
## 6 American IPA 39.6 -86.3 0.0695 81.1 8
## 7 American IPA 37.7 -122. 0.0696 63.7 7
## 8 American IPA 40.6 -73.9 0.0729 76.9 7
## 9 American IPA 42.7 -71.9 0.058 55.4 7
## 10 American IPA 44.8 -93.3 0.0696 84 7
## # … with 139 more rows
ggplot() +
geom_polygon(data = usa, aes(x=long, y = lat, group = group),color = "white", fill="grey", alpha=0.3) +
geom_point( data=ipa, aes(x=long, y=lat, color=meanIBU, size=beers, alpha=0.5)) +
scale_color_viridis(option="viridis", name="IBU" ) +
theme_void() +
scale_size(range = c(1, 10), name=" # of beers")+
ggtitle("Number of American IPA Beers")